\[\\[.4in]\]
library(dplyr)
library(ggplot2)
library(lubridate)
library(MMWRweek)
library(readr)
library(rlang)
library(magrittr)
library(aws.s3)
library(data.table)
library(dplyr)
library(DT)
library(ggplot2)
library(plotly)
library(readr)
library(stringr)
library(tidyr)
suppressPackageStartupMessages(source(here::here("R", "load_all.R")))
forecast_date <- as.Date("2024-11-20")
epi_data <- readr::read_csv("https://data.cdc.gov/resource/ua7e-t2fy.csv?$limit=20000&$select=weekendingdate,jurisdiction,totalconfc19newadm")
epi_data <- epi_data %>%
mutate(
epiweek = epiweek(weekendingdate),
epiyear = epiyear(weekendingdate)
) %>%
left_join(
(.) %>%
distinct(epiweek, epiyear) %>%
mutate(
season = convert_epiweek_to_season(epiyear, epiweek),
season_week = convert_epiweek_to_season_week(epiyear, epiweek)
),
by = c("epiweek", "epiyear")
)
epi_data <- epi_data %>%
mutate(time_value = as.Date(weekendingdate), geo_value = tolower(jurisdiction), nhsn = totalconfc19newadm) %>%
select(-weekendingdate, -jurisdiction, -totalconfc19newadm)
ahead <- 0
aheads <- -1:3
window_size <- 3
recent_window <- 3
geos_to_plot <- c("mn", "ca", "pa", "usa", "ri") #epi_data %>% pull(geo_value) %>% unique() %>% setdiff(c("as", "mp", "vi")) #
all_geos_to_plot <- epi_data %>% pull(geo_value) %>% unique() %>% setdiff(c("as", "mp", "vi"))
plot_quantiles <- c(0.6, 0.75, 0.95, 0.99)
quantile_alphas <- c(0.9, 0.6, 0.4, 0.3)
forecast_week <- epiweek(forecast_date)
last_date_data <- epi_data %>%
pull(time_value) %>%
max()
relevant_period <- epi_data %>% select(epiweek, time_value, season) %>%distinct %>% filter(
(abs(forecast_week + min(aheads) - epiweek) <= window_size) |
(abs(forecast_week + min(aheads) - epiweek) <= window_size) |
(last_date_data - time_value <= recent_window * 7),
season != "2024/25"
)
relevant_period %<>% group_by(season) %>% summarise(start = min(time_value), stop = max(time_value))
quantile_basic uses base R’s quantile’s, each geo
separately, using a window of 7 weeks (3 before and after the target
date), and the 3 weeks before the last week of dataepipred quantile is more or less the same, but using
the quantile extrapolation in epipredict after fitting the quantiles
.25, .5, and .75. The main difference is that the tails are more extreme
and can be outside the range of previously seen values for that
location.geo aggregated is a global fit, that first puts
everything on a rate scale, and then does a single global forecast, and
then returns to the original scales. It is just using base R
quantiles.The intervals here are the 75th, 95th, and 99th quantiles. Some notes:
quantile_basic <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, disease = "covid")) %>%
bind_rows() %>%
mutate(geo_type = "state", forecaster = "quantile basic")
quantile_7 <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, quant_type = 7, disease = "covid")) %>%
bind_rows() %>%
mutate(geo_type = "state", forecaster = "quantile_7")
epipred_extrapolation <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, quantile_method = "epipredict", disease = "covid")) %>% bind_rows() %>%
mutate(geo_type = "state", forecaster = "epipred quantile")
geo_aggregate <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, geo_agg = TRUE, disease = "covid")) %>% bind_rows() %>%
mutate(geo_type = "state", forecaster = "geo aggregated")
all_results <- bind_rows(
geo_aggregate,
quantile_basic,
epipred_extrapolation
)
small_truth_data <- epi_data %>%
mutate(value = nhsn, target_end_date = time_value, forecaster = "nhsn") %>%
filter(geo_value %in% geos_to_plot)
the_plot <-
all_results %>%
filter(geo_value %in% geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = small_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 1500, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
full_truth_data <- epi_data %>%
mutate(value = nhsn, target_end_date = time_value, forecaster = "nhsn") %>%
filter(geo_value %in% all_geos_to_plot)
the_plot <-
all_results %>%
filter(geo_value %in% all_geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = full_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 5000, width = 1700) %>%
layout(hoverlabel = list(bgcolor = "white"))
These are
epipredict quantile abovequantile basic
abovethe_plot <- mean_climatological %>%
filter(geo_value %in% geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = small_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 1500, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
the_plot <- mean_climatological %>%
filter(geo_value %in% all_geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = full_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 5000, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
Since the dynamics are exponential, averaging on the log scale is probably a better method. That way, if the national is predicting something an order of magnitude higher that won’t swamp the local information. This is the same set using the geomean instead of the arithmetic mean.
"all three"the mean of all 3"epipred quantile"the mean of the geo aggregated and
the quantile basic above"quantile_basic"the mean of the geo aggregated and the
epipredict quantile abovethe_plot <- mean_climatological %>%
filter(geo_value %in% geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = small_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 1500, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
the_plot <- mean_climatological %>%
filter(geo_value %in% all_geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = full_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 5000, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
Given that those 2 years are clearly larger than whatever we’re settling into in 2022 and beyond, we should try a model that doesn’t include those years, which may be closer to something used this year
quantile_basic <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead)) %>%
bind_rows() %>%
mutate(geo_type = "state", forecaster = "quantile basic")
quantile_7 <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, quant_type = 7)) %>%
bind_rows() %>%
mutate(geo_type = "state", forecaster = "quantile_7")
epipred_extrapolation <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, quantile_method = "epipredict")) %>% bind_rows() %>%
mutate(geo_type = "state", forecaster = "epipred quantile")
geo_aggregate <- lapply(-1:3, \(ahead) climatological_model(epi_data, forecast_date, ahead, geo_agg = TRUE)) %>% bind_rows() %>%
mutate(geo_type = "state", forecaster = "geo aggregated")
all_results <- bind_rows(
geo_aggregate,
quantile_basic,
epipred_extrapolation
)
small_truth_data <- epi_data %>%
mutate(value = nhsn, target_end_date = time_value, forecaster = "nhsn") %>%
filter(geo_value %in% geos_to_plot)
the_plot <-
all_results %>%
filter(geo_value %in% geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = small_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 1500, width = 1600) %>%
layout(hoverlabel = list(bgcolor = "white"))
full_truth_data <- epi_data %>%
mutate(value = nhsn, target_end_date = time_value, forecaster = "nhsn") %>%
filter(geo_value %in% all_geos_to_plot)
the_plot <-
all_results %>%
filter(geo_value %in% all_geos_to_plot) %>%
plot_forecasts(forecast_date, geo_type = "state", truth_data = full_truth_data, quantiles = plot_quantiles, alphas = quantile_alphas, relevant_period = relevant_period)
ggplotly(the_plot, tooltip = "text", height = 5000, width = 1700) %>%
layout(hoverlabel = list(bgcolor = "white"))